HEAD ======= >>>>>>> 001d6a0d944c1bbe14ed89d19aa435141cdff71a
# Raw inputs: county-level presidential results, the state-level summary and
# the state lookup table. The lookup's `State` column is renamed to `state`
# so that it shares a column name with the election results.
vote_df <- read_csv("./datasets/president_county_candidate.csv")
state_sum <- read_csv("./datasets/president_state.csv")
region_df <- rename(read_csv("./datasets/states.csv"), state = State)
# Annotate every county/candidate row with its state-level totals and whether
# the row's party carried the state.
#   party_total  - sum of total_votes over the row's (state, party)
#   state_total  - sum of total_votes over all rows of the state
#   state_winner - TRUE when the row's party_total equals the state's maximum
# `vote_df` already holds this CSV, so it is reused instead of parsing the
# file a second time. The result is ungrouped so the grouping does not leak
# into later pipelines.
election_winner_df <-
  vote_df %>%
  group_by(state, party) %>%
  mutate(party_total = sum(total_votes)) %>%
  # group_by() replaces the previous grouping, so no ungroup() is needed here.
  group_by(state) %>%
  mutate(
    state_winner = party_total == max(party_total),
    state_total = sum(total_votes)
  ) %>%
  ungroup()
# Winning candidate of each state together with the state's Region.
winner_region <-
  election_winner_df %>%
  left_join(region_df) %>%
  filter(state_winner) %>%
  select(state, candidate, state_total, Region) %>%
  distinct()

# One row per state for the winning candidate; `region` is the lower-cased
# state name, which is the column shared with the map polygons below.
election_map_df <-
  election_winner_df %>%
  mutate(region = tolower(state)) %>%
  filter(state_winner) %>%
  select(state, candidate, party_total, state_total, region) %>%
  distinct()

# State polygons with the election winner attached to every polygon vertex.
usa_map <- map_data("state")
usa_election_map <- usa_map %>% left_join(election_map_df)
# Fill palette keyed by candidate name.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Map of the actual state winners. The `text` aesthetic is not drawn by
# ggplot; it is what ggplotly() is asked to show as the tooltip.
election_result_map <-
  usa_election_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = candidate,
      text = paste(
        "State: ", state,
        "</br></br>Candidate: ", candidate,
        "</br>Votes: ", party_total,
        "</br>Winning Proportion: ", round(party_total / state_total, 2)
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  labs(title = "Election Results across states") +
  scale_fill_manual(values = colors) +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(election_result_map, tooltip = "text")
<<<<<<< HEAD
=======
<<<<<<< HEAD
>>>>>>> 2ae1db8d314d278a935d777400b07ed94c0622a3
# Read the two CSV exports for one hashtag, combine them and tidy the result.
#
# path_a, path_b : paths of the two CSV files to combine (full outer merge on
#                  all shared columns).
# hashtag_label  : value stored in the constant `hashtag` column.
# Returns a data frame in which `created_at` and `user_join_date` are split
# into year / month / day / time columns.
load_hashtag_tweets <- function(path_a, path_b, hashtag_label) {
  merge(read_csv(path_a), read_csv(path_b), all = TRUE) %>%
    # Drop the unnamed row-index column. read_csv() names it `X1` in readr 1.x
    # and `...1` in readr 2.x; any_of() ignores whichever one is absent.
    select(-any_of(c("X1", "...1"))) %>%
    separate(
      created_at,
      into = c("creation_date", "creation_time"),
      sep = " "
    ) %>%
    separate(
      creation_date,
      into = c("creation_year", "creation_month", "creation_day"),
      sep = "-"
    ) %>%
    separate(
      user_join_date,
      into = c("join_date", "join_time"),
      sep = " "
    ) %>%
    separate(
      join_date,
      into = c("join_year", "join_month", "join_day"),
      sep = "-"
    ) %>%
    # `!!` forces the function argument, so a data column with the same name
    # can never shadow it.
    mutate(hashtag = !!hashtag_label)
}

trump_df <- load_hashtag_tweets(
  "./datasets/trump1.csv",
  "./datasets/trump2.csv",
  "Trump"
)

biden_df <- load_hashtag_tweets(
  "./datasets/biden1.csv",
  "./datasets/biden2.csv",
  "Biden"
)

# All tweets for either hashtag, restricted to tweets located in the USA.
tweets_usa <-
  merge(biden_df, trump_df, all = TRUE) %>%
  filter(country == "United States of America")
usa_map <- map_data("state")

# Per-state engagement score for each hashtag and the hashtag that "wins" the
# state. likes_tweets = (total likes) * (number of tweets) per state/hashtag.
tweet_map <- tweets_usa %>%
  group_by(state, hashtag) %>%
  summarise(
    count = n(),
    likes = sum(likes),
    .groups = "drop_last"
  ) %>%
  mutate(
    likes_tweets = likes * count,
    region = tolower(state)
  ) %>%
  # `state` is kept explicitly: the hover text of the tweet map and the later
  # comparison merge both read it.
  select(state, region, hashtag, likes_tweets) %>%
  pivot_wider(
    names_from = "hashtag",
    values_from = "likes_tweets"
  ) %>%
  mutate(
    # A hashtag without a score in a state is NA after pivot_wider(). Both
    # sides are coalesced to 0 so such a state is still attributed to the
    # hashtag that does have a score; previously only `Trump` was coalesced,
    # so a state with a missing `Biden` score got `top = NA`. Exact ties
    # remain NA.
    top = case_when(
      coalesce(Biden, 0) > coalesce(Trump, 0) ~ "Biden",
      coalesce(Trump, 0) > coalesce(Biden, 0) ~ "Trump"
    )
  )

# Attach the tweet winner to the state polygons and expand the hashtag to the
# candidate's full name so it matches the keys of the fill palette.
states_tweet_map <- left_join(usa_map, tweet_map, by = "region") %>%
  mutate(
    top = recode(
      top,
      "Trump" = "Donald Trump",
      "Biden" = "Joe Biden"
    )
  )
# Fill palette keyed by candidate name.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Map of the candidate with the highest tweet score per state. The `text`
# aesthetic is what ggplotly() is asked to show as the tooltip.
tweet_result_map <-
  states_tweet_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = top,
      text = paste(
        "State: ", state,
        "</br></br>Candidate: ", top
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  labs(title = "Tweets Results across states") +
  scale_fill_manual(values = colors) +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(tweet_result_map, tooltip = "text")
<<<<<<< HEAD
=======
=======
>>>>>>> 001d6a0d944c1bbe14ed89d19aa435141cdff71a
>>>>>>> 2ae1db8d314d278a935d777400b07ed94c0622a3
##Clean Polls and Regional datasets
# Poll model toplines per state and model date. The model date is parsed and
# split into year / month / day, and the challenger ("chal") and incumbent
# ("inc") columns are renamed to biden_* and trump_* respectively. The
# candidate name columns are not kept.
polls_df <-
  read_csv("./datasets/presidential_state_toplines_2020.csv") %>%
  rename(date = modeldate) %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y")) %>%
  separate(date, into = c("year", "month", "day")) %>%
  select(
    state, month, day,
    biden_winstate = winstate_chal,
    biden_voteshare = voteshare_chal,
    trump_winstate = winstate_inc,
    trump_voteshare = voteshare_inc,
    voteshare_margin = margin,
    expvote_turnout = state_turnout
  ) %>%
  arrange(state, month, day)
# State lookup table; `State` is renamed to `state` so it can serve as the
# merge key with the polls data.
region_df <- rename(read_csv("./datasets/states.csv"), state = State)
## Merge Polls and Regional Data
# Polls joined to the state lookup on `state`, ordered by state and date,
# without the state code, and with `state` and `Region` moved to the front.
polls_merge <-
  polls_df %>%
  merge(region_df, by = "state") %>%
  arrange(state, month, day) %>%
  select(-`State Code`) %>%
  relocate("state", "Region")
## Clean vote proportion and number of votes for each candidate by state
# Expected turnout and vote share per state and candidate, averaged over the
# November model rows, in long format (one row per state and candidate).
# candidate_votes = expected share (in percent) applied to expected turnout.
exp_votes <-
  polls_merge %>%
  filter(month == 11) %>%
  select(-trump_winstate, -biden_winstate, -voteshare_margin, -Division) %>%
  rename(State = state) %>%
  group_by(Region, State, month) %>%
  summarise(
    voter_turnout = mean(expvote_turnout),
    prop_Biden = mean(biden_voteshare),
    prop_Trump = mean(trump_voteshare)
  ) %>%
  pivot_longer(
    cols = prop_Biden:prop_Trump,
    names_to = "Candidate",
    names_prefix = "prop_",
    values_to = "votes_proportion"
  ) %>%
  mutate(candidate_votes = (votes_proportion / 100) * voter_turnout)
# Candidate with the highest expected vote count in each state. `region` is
# the lower-cased state name used to join onto the map polygons.
state_winner_df <-
  exp_votes %>%
  group_by(State) %>%
  mutate(
    state_winner = candidate_votes == max(candidate_votes),
    region = tolower(State)
  ) %>%
  filter(state_winner) %>%
  select(-month) %>%
  distinct()
# State polygons with the anticipated winner attached; the short candidate
# label is expanded to the full name so it matches the fill palette keys.
usa_map <- map_data("state")
us_election_map <-
  usa_map %>%
  left_join(state_winner_df) %>%
  mutate(
    Candidate = recode(
      Candidate,
      "Trump" = "Donald Trump",
      "Biden" = "Joe Biden"
    )
  )
# Fill palette keyed by candidate name.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Map of the winners anticipated by the polls. The `text` aesthetic is what
# ggplotly() is asked to show as the tooltip.
election_map <-
  us_election_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = Candidate,
      text = paste(
        "State: ", State,
        "</br></br>Candidate: ", Candidate,
        "</br>Expected Votes Proportion: ", votes_proportion
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  labs(title = "Anticipated Election Results across states") +
  scale_fill_manual(values = colors) +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(election_map, tooltip = "text")
<<<<<<< HEAD
=======
<<<<<<< HEAD
=======
# Read the two CSV exports for one hashtag, combine them and tidy the result.
#
# path_a, path_b : paths of the two CSV files to combine (full outer merge on
#                  all shared columns).
# hashtag_label  : value stored in the constant `hashtag` column.
# Returns a data frame in which `created_at` and `user_join_date` are split
# into year / month / day / time columns.
load_hashtag_tweets <- function(path_a, path_b, hashtag_label) {
  merge(read_csv(path_a), read_csv(path_b), all = TRUE) %>%
    # Drop the unnamed row-index column. read_csv() names it `X1` in readr 1.x
    # and `...1` in readr 2.x; any_of() ignores whichever one is absent.
    select(-any_of(c("X1", "...1"))) %>%
    separate(
      created_at,
      into = c("creation_date", "creation_time"),
      sep = " "
    ) %>%
    separate(
      creation_date,
      into = c("creation_year", "creation_month", "creation_day"),
      sep = "-"
    ) %>%
    separate(
      user_join_date,
      into = c("join_date", "join_time"),
      sep = " "
    ) %>%
    separate(
      join_date,
      into = c("join_year", "join_month", "join_day"),
      sep = "-"
    ) %>%
    # `!!` forces the function argument, so a data column with the same name
    # can never shadow it.
    mutate(hashtag = !!hashtag_label)
}

trump_df <- load_hashtag_tweets(
  "./datasets/trump1.csv",
  "./datasets/trump2.csv",
  "Trump"
)

biden_df <- load_hashtag_tweets(
  "./datasets/biden1.csv",
  "./datasets/biden2.csv",
  "Biden"
)

# All tweets for either hashtag, restricted to tweets located in the USA.
tweets_usa <-
  merge(biden_df, trump_df, all = TRUE) %>%
  filter(country == "United States of America")
usa_map <- map_data("state")

# Per-state engagement score for each hashtag and the hashtag that "wins" the
# state. likes_tweets = (total likes) * (number of tweets) per state/hashtag.
tweet_map <- tweets_usa %>%
  group_by(state, hashtag) %>%
  summarise(
    count = n(),
    likes = sum(likes),
    .groups = "drop_last"
  ) %>%
  mutate(
    likes_tweets = likes * count,
    region = tolower(state)
  ) %>%
  # `state` is kept explicitly: the hover text of the tweet map and the later
  # comparison merge both read it.
  select(state, region, hashtag, likes_tweets) %>%
  pivot_wider(
    names_from = "hashtag",
    values_from = "likes_tweets"
  ) %>%
  mutate(
    # A hashtag without a score in a state is NA after pivot_wider(). Both
    # sides are coalesced to 0 so such a state is still attributed to the
    # hashtag that does have a score; previously only `Trump` was coalesced,
    # so a state with a missing `Biden` score got `top = NA`. Exact ties
    # remain NA.
    top = case_when(
      coalesce(Biden, 0) > coalesce(Trump, 0) ~ "Biden",
      coalesce(Trump, 0) > coalesce(Biden, 0) ~ "Trump"
    )
  )

# Attach the tweet winner to the state polygons and expand the hashtag to the
# candidate's full name so it matches the keys of the fill palette.
states_tweet_map <- left_join(usa_map, tweet_map, by = "region") %>%
  mutate(
    top = recode(
      top,
      "Trump" = "Donald Trump",
      "Biden" = "Joe Biden"
    )
  )
# Fill palette keyed by candidate name.
colors <- c("Donald Trump" = "dark red", "Joe Biden" = "dark blue")

# Map of the candidate with the highest tweet score per state. The `text`
# aesthetic is what ggplotly() is asked to show as the tooltip.
tweet_result_map <-
  states_tweet_map %>%
  ggplot(
    aes(
      x = long,
      y = lat,
      group = group,
      fill = top,
      text = paste(
        "State: ", state,
        "</br></br>Candidate: ", top
      )
    )
  ) +
  geom_polygon(color = "gray90", size = 0.1) +
  labs(title = "Tweets Results across states") +
  scale_fill_manual(values = colors) +
  theme_void() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

ggplotly(tweet_result_map, tooltip = "text")
>>>>>>> 001d6a0d944c1bbe14ed89d19aa435141cdff71a
>>>>>>> 2ae1db8d314d278a935d777400b07ed94c0622a3
# Poll map with `State` renamed to `state` so it shares that column with the
# other two map tables.
us_election_df <- rename(us_election_map, state = State)

# Election results, tweet results and poll expectations merged on all of their
# shared columns, reduced to the columns used by the comparison tables.
comparison_df <-
  usa_election_map %>%
  merge(states_tweet_map) %>%
  merge(us_election_df) %>%
  select(
    state, voter_turnout, Candidate,
    votes_proportion,
    party_total, top, candidate, state_total
  )
# States where the poll winner differs from the election winner, with the gap
# between the expected turnout and the actual state vote total.
comparison_df %>%
  select(
    state,
    voter_turnout,
    poll_winner = Candidate,
    election_winner = candidate,
    state_total
  ) %>%
  filter(poll_winner != election_winner) %>%
  group_by(state) %>%
  mutate(poll_vs_result = voter_turnout - state_total) %>%
  select(-c(voter_turnout, state_total)) %>%
  unique() %>%
  knitr::kable()
| state | poll_winner | election_winner | poll_vs_result |
|---|---|---|---|
| North Carolina | Joe Biden | Donald Trump | 195821.7 |
| Florida | Joe Biden | Donald Trump | 110595.3 |
From this computation, we can see that the polls mis-anticipated both the winner of the election and the voter turnout in North Carolina and Florida: the expected turnout exceeded the actual vote total by roughly 196 thousand votes in North Carolina and 111 thousand votes in Florida. Even though Donald Trump won these two states, we can see that the votes he received actually fell short of what was anticipated.
# States where the poll winner differs from the election winner, comparing the
# poll winner's expected vote share with the election winner's actual share
# (both in percent).
comparison_df %>%
  select(
    state,
    votes_proportion,
    poll_winner = Candidate,
    party_total,
    election_winner = candidate,
    state_total
  ) %>%
  filter(poll_winner != election_winner) %>%
  group_by(state) %>%
  mutate(
    votes_proportion = round(votes_proportion, 2),
    prop_vote = round(party_total / state_total, 4) * 100,
    poll_vs_result = votes_proportion - prop_vote
  ) %>%
  select(-c(party_total, state_total)) %>%
  unique() %>%
  knitr::kable()
| state | votes_proportion | poll_winner | election_winner | prop_vote | poll_vs_result |
|---|---|---|---|---|---|
| North Carolina | 50.53 | Joe Biden | Donald Trump | 49.93 | 0.6 |
| Florida | 50.82 | Joe Biden | Donald Trump | 51.22 | -0.4 |
In terms of proportions, we can see that the polls anticipated that Joe Biden would win North Carolina with 50.53% of the vote and Florida with 50.82% of the vote. In fact, however, Donald Trump won North Carolina with 49.93% of the vote and Florida with 51.22% of the vote.
# States where the candidate with the top tweet score differs from the
# election winner.
comparison_df %>%
  select(
    state,
    election_winner = candidate,
    top_tweet = top
  ) %>%
  filter(top_tweet != election_winner) %>%
  group_by(state) %>%
  unique() %>%
  knitr::kable()
| state | election_winner | top_tweet |
|---|---|---|
| Oklahoma | Donald Trump | Joe Biden |
| Texas | Donald Trump | Joe Biden |
| Nebraska | Donald Trump | Joe Biden |
| North Dakota | Donald Trump | Joe Biden |
| South Dakota | Donald Trump | Joe Biden |
| Montana | Donald Trump | Joe Biden |
| Utah | Donald Trump | Joe Biden |
| California | Joe Biden | Donald Trump |
| Oregon | Joe Biden | Donald Trump |
| Washington | Joe Biden | Donald Trump |
| Maine | Joe Biden | Donald Trump |
| Rhode Island | Joe Biden | Donald Trump |
| Vermont | Joe Biden | Donald Trump |
| Connecticut | Joe Biden | Donald Trump |
| Pennsylvania | Joe Biden | Donald Trump |
| Maryland | Joe Biden | Donald Trump |
| Virginia | Joe Biden | Donald Trump |
| District of Columbia | Joe Biden | Donald Trump |
| West Virginia | Donald Trump | Joe Biden |
| South Carolina | Donald Trump | Joe Biden |
| Ohio | Donald Trump | Joe Biden |
| Michigan | Joe Biden | Donald Trump |
| Indiana | Donald Trump | Joe Biden |
| Illinois | Joe Biden | Donald Trump |
| Mississippi | Donald Trump | Joe Biden |
| Louisiana | Donald Trump | Joe Biden |
| Iowa | Donald Trump | Joe Biden |
We found 27 states where the candidate with the top tweets differs from the election result. Perhaps the tweets relating to a specific candidate are not necessarily positive; they may well be negative. We can explore the tweets with the word cloud app we built to obtain more insights.
# States where the poll winner, the election winner and the top-tweet
# candidate are not all the same.
comparison_df %>%
  select(
    state,
    poll_winner = Candidate,
    election_winner = candidate,
    top_tweet = top
  ) %>%
  filter(
    poll_winner != election_winner |
      top_tweet != election_winner |
      poll_winner != top_tweet
  ) %>%
  group_by(state) %>%
  unique() %>%
  knitr::kable()
| state | poll_winner | election_winner | top_tweet |
|---|---|---|---|
| Oklahoma | Donald Trump | Donald Trump | Joe Biden |
| Texas | Donald Trump | Donald Trump | Joe Biden |
| Nebraska | Donald Trump | Donald Trump | Joe Biden |
| North Dakota | Donald Trump | Donald Trump | Joe Biden |
| South Dakota | Donald Trump | Donald Trump | Joe Biden |
| Montana | Donald Trump | Donald Trump | Joe Biden |
| Utah | Donald Trump | Donald Trump | Joe Biden |
| California | Joe Biden | Joe Biden | Donald Trump |
| Oregon | Joe Biden | Joe Biden | Donald Trump |
| Washington | Joe Biden | Joe Biden | Donald Trump |
| Maine | Joe Biden | Joe Biden | Donald Trump |
| Rhode Island | Joe Biden | Joe Biden | Donald Trump |
| Vermont | Joe Biden | Joe Biden | Donald Trump |
| Connecticut | Joe Biden | Joe Biden | Donald Trump |
| Pennsylvania | Joe Biden | Joe Biden | Donald Trump |
| Maryland | Joe Biden | Joe Biden | Donald Trump |
| Virginia | Joe Biden | Joe Biden | Donald Trump |
| North Carolina | Joe Biden | Donald Trump | Donald Trump |
| District of Columbia | Joe Biden | Joe Biden | Donald Trump |
| West Virginia | Donald Trump | Donald Trump | Joe Biden |
| South Carolina | Donald Trump | Donald Trump | Joe Biden |
| Florida | Joe Biden | Donald Trump | Donald Trump |
| Ohio | Donald Trump | Donald Trump | Joe Biden |
| Michigan | Joe Biden | Joe Biden | Donald Trump |
| Indiana | Donald Trump | Donald Trump | Joe Biden |
| Illinois | Joe Biden | Joe Biden | Donald Trump |
| Mississippi | Donald Trump | Donald Trump | Joe Biden |
| Louisiana | Donald Trump | Donald Trump | Joe Biden |
| Iowa | Donald Trump | Donald Trump | Joe Biden |
To sum up the differences among the three datasets, the polls still performed better than the popular-tweets analysis. Maybe this is because we haven’t yet learned how to let the machine distinguish the meaning of words. Once the machine can distinguish the meaning of the words, it can perhaps provide more insights into the election.